library(xml2)
library(dplyr)
library(readr)
library(ggplot2)
library(knitr)
library(stringr)
library(tidyr)
library(kableExtra)
library(grid)
library(gridExtra)
Since information about markables is stored in XML files, these are first converted into CSV format for better handling. The following code is a function that is responsible for this conversion. It loads an XML file, takes care of the XML namespaces, and then extracts the information of the individual markables. A for-loop makes it possible to repeat this process for all XML files in the folder. The output is stored in a separate folder called ‘output_csv’ within the input folder and consists of one CSV file per XML input file.
#' Convert the markables of one XML file into a CSV file.
#'
#' Reads `xml_ind`, strips the default namespaces, collects the attributes of
#' every node matched by the XPath `markable` (one row per node, one column
#' per attribute name) and writes the table into `output_f`. The CSV file is
#' named after the XML file, with the `.xml` extension replaced by `.csv`.
#'
#' @param xml_ind Path to a single XML file.
#' @param output_f Path to the folder that receives the CSV file.
#' @return Whatever `write_csv()` returns.
xml_to_csv <- function(xml_ind, output_f) {
  # The pattern is escaped and anchored: the former ".xml" was a regex in
  # which "." matches any character, so "my_xml_file.xml" was turned into
  # "my.csv_file.xml" instead of "my_xml_file.csv".
  csv_name <- sub("\\.xml$", ".csv", basename(xml_ind))

  read_xml(xml_ind) %>%
    # strip default namespaces so that the plain XPath below matches
    xml_ns_strip() %>%
    xml_find_all("markable") %>%
    # one named character vector of attribute values per markable
    xml_attrs() %>%
    bind_rows() %>%
    write_csv(file = file.path(output_f, csv_name))
}
# input folders that contain the XML files
folders_xml <- c(
  "../Statistics/data/ARRAU/RST",
  "../Statistics/data/ARRAU/Trains_91",
  "../Statistics/data/PD/gutenberg",
  "../Statistics/data/PD/wiki"
)

# convert every XML file of every input folder
for (input_f in folders_xml) {
  # the CSV files go into an 'output_csv' sub-folder of the input folder
  output_f <- file.path(input_f, "output_csv")
  dir.create(output_f, showWarnings = FALSE)

  # XML files that sit directly in the input folder
  all_xml_ind <- list.files(path = input_f, pattern = "\\.xml$", full.names = TRUE)
  for (xml_ind in all_xml_ind) xml_to_csv(xml_ind, output_f)
}
The following code creates data frames from the data of the ARRAU corpus.
#' Read all CSV files of one folder into a single data frame.
#'
#' @param path_f Folder that is searched for files ending in `.csv`.
#' @param name_ds Label written into the `dataset` column of every row.
#' @return A data frame holding the rows of all CSV files, extended by the
#'   columns `dataset` (= `name_ds`) and `file_name` (base name of the source
#'   file). An empty data frame is returned when no CSV file is found.
create_df <- function(path_f, name_ds) {
  # all CSV files of the folder
  csv_list <- list.files(path_f, pattern = "\\.csv$", full.names = TRUE)

  # read every file and tag its rows with data-set and file name
  per_file <- lapply(csv_list, function(csv_ind) {
    csv_data <- read_csv(csv_ind)
    csv_data$dataset <- name_ds
    csv_data$file_name <- basename(csv_ind)
    csv_data
  })

  # Bind once instead of growing the result inside the loop, which copied
  # the accumulated rows again for every file. The leading empty data.frame
  # is kept as first input, exactly as in the former loop.
  bind_rows(c(list(data.frame()), per_file))
}
#define the paths to the folders containing the CSV files of the ARRAU corpus
RST_path <- "../Statistics/data/ARRAU/RST/output_csv"
TRAINS91_path <- "../Statistics/data/ARRAU/Trains_91/output_csv"
#apply function to the CSV files of the RST domain; create df (rows are tagged with dataset = "RST")
RST_df <- create_df(RST_path, "RST")
#apply function to the CSV files of the TRAINS domain; create df (rows are tagged with dataset = "Trains_91")
TRAINS91_df <- create_df(TRAINS91_path, "Trains_91")
#combine the RST_df and TRAINS91_df data frames row-wise into one ARRAU data frame
COMB_RST_TRAINS_df <- bind_rows(RST_df, TRAINS91_df)
The following code creates data frames from the data of the PD corpus.
#' Read all CSV files of one folder into a single data frame.
#'
#' Same behaviour as the `create_df()` defined for the ARRAU corpus earlier in
#' this file; it is defined again here for the PD corpus.
#'
#' @param path_f Folder that is searched for files ending in `.csv`.
#' @param name_ds Label written into the `dataset` column of every row.
#' @return A data frame holding the rows of all CSV files, extended by the
#'   columns `dataset` (= `name_ds`) and `file_name` (base name of the source
#'   file). An empty data frame is returned when no CSV file is found.
create_df <- function(path_f, name_ds) {
  # all CSV files of the folder
  csv_list <- list.files(path_f, pattern = "\\.csv$", full.names = TRUE)

  # read every file and tag its rows with data-set and file name
  per_file <- lapply(csv_list, function(csv_ind) {
    csv_data <- read_csv(csv_ind)
    csv_data$dataset <- name_ds
    csv_data$file_name <- basename(csv_ind)
    csv_data
  })

  # Bind once instead of growing the result inside the loop, which copied
  # the accumulated rows again for every file. The leading empty data.frame
  # is kept as first input, exactly as in the former loop.
  bind_rows(c(list(data.frame()), per_file))
}
#define the paths to the folders containing the CSV files of the PD corpus
GB_path <- "../Statistics/data/PD/gutenberg/output_csv"
WIKI_path <- "../Statistics/data/PD/wiki/output_csv"
#apply function to the CSV files of the gutenberg domain; create df (rows are tagged with dataset = "GB")
GB_df <- create_df(GB_path, "GB")
#apply function to the CSV files of the wikipedia domain; create df (rows are tagged with dataset = "Wiki")
WIKI_df <- create_df(WIKI_path, "Wiki")
#combine the GB_df and WIKI_df data frames row-wise into one PD data frame
COMB_GB_WIKI_df <- bind_rows(GB_df, WIKI_df)
## combine ARRAU and PD data; corpus-specific columns are removed first so
## that both data frames have the same columns
# Columns are dropped with a logical mask rather than `-which(...)`: if a
# column were missing, `which()` would return integer(0) and the negative
# index would then select NO columns at all instead of all of them.

# Remove 'disagreement_type' column from COMB_GB_WIKI_df; new df called CGW
CGW <- COMB_GB_WIKI_df[
  ,
  !(names(COMB_GB_WIKI_df) %in% "disagreement_type"),
  drop = FALSE
]
# Remove multiple columns from COMB_RST_TRAINS_df; new df is called CRT
CRT <- COMB_RST_TRAINS_df[
  ,
  !(names(COMB_RST_TRAINS_df) %in% c("on_map", "on_map_2", "objectadditional")),
  drop = FALSE
]
# Combine CRT and CGW in a new df called combined_df
combined_df <- rbind(CRT, CGW) # FOR FURTHER ANALYSIS

## sort columns
# general information in fixed order
general_info <- c("min_words", "min_ids", "id", "file_name", "span", "mmax_level", "dataset", "comment")
# further (gramm.) information in fixed order
info_gramm <- c("number", "gender", "person", "gram_fnc", "ambiguity")
# all remaining columns hold annotated information; they are sorted alphabetically
all_columns <- names(combined_df)
anno_info <- setdiff(all_columns, c(general_info, info_gramm))
anno_info_sorted <- sort(anno_info)
# Reorder the data frame by combining these three groups
df_sorted <- combined_df[, c(general_info, info_gramm, anno_info_sorted)]
# Convert every column to character and put the list back into a data frame.
# stringsAsFactors is spelled out so the columns stay character on R < 4.0 too.
df_sorted <- lapply(df_sorted, as.character)
df_sorted <- as.data.frame(df_sorted, stringsAsFactors = FALSE)
# View the reordered DataFrame
#print(df_sorted)
Total number of markables in the ARRAU data: 47938
Total number of markables in the PD data: 13629
Distribution of the ambiguity attribute in ARRAU:
ambiguous ambiguous_antecedent unambiguous
593 8 40452
Distribution of the ambiguity attribute in PD:
ambiguous ambiguous_antecedent unambiguous
795 7 11587
Thus, the total number of markables from both data sets amounts to 61567, with the following distribution of the ambiguity attribute:
ambiguous ambiguous_antecedent unambiguous
1388 15 52039
The categorisation that follows below will, for now, concentrate on markables annotated as being ambiguous.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## ambiguity n percentage label
## 1 ambiguous 1388 2.2544545 2.25%
## 2 ambiguous_antecedent 15 0.0243637 0.02%
## 3 unambiguous 52039 84.5241769 84.52%
## 4 <NA> 8125 13.1970049 13.20%
This section provides a small overview over the disagreement_type attribute, annotated in PD. Possible values are:
The disagreement_type value was determined by players of a game-with-a-purpose, namely Phrase detectives (cite), thus reflecting a crowdsourced annotation of possible ambiguity of a markable. A possible ‘ambiguity’ value for this attribute must not be confused with the ‘ambiguous’ value that can be found in the ambiguity attribute, which was annotated by experts, thus reflecting a gold annotation of ambiguity.
## disagreement_type n percentage label
## 1 ambiguity 488 3.58060019 3.58%
## 2 interface 894 6.55954215 6.56%
## 3 misunderstanding 149 1.09325703 1.09%
## 4 no_disagreement 12092 88.72257686 88.72%
## 5 <NA> 6 0.04402377 0.04%
The plot shows the distribution of unique values of the disagreement_type attribute for all PD markables.
## # A tibble: 3 × 3
## disagreement_type count percent
## <chr> <int> <dbl>
## 1 ambiguity 484 60.9
## 2 interface 6 0.755
## 3 no_disagreement 305 38.4
The plot shows the distribution of unique values of the disagreement_type attribute for all PD markables that are annotated as being ambiguous.
In the following, an examination of the agreement of players of PD and expert annotators regarding an ambiguity of a markable will be presented. The values ‘interface’ and ‘misunderstanding’ will be ignored for now.
## condition count percentage label
## 1 Ambiguous but no disagreement 305 2.56044325 2.56%
## 2 Ambiguous and disagreement 484 4.06312962 4.06%
## 3 Unambiguous but disagreement 3 0.02518469 0.03%
## 4 Unambiguous and no disagreement 11120 93.35124244 93.35%
The markables represented by point three are especially interesting and the unique markables will be analyzed qualitatively, since the players might have detected an ambiguity here that was either not detected by the annotators or cannot be categorised as such according to the annotation scheme. A closer look at the markables represented by point one is also at hand, which is the counterpart of point three.
# keep only the markables that are annotated as being ambiguous
combined_df_amb <- filter(df_sorted, ambiguity == "ambiguous")

### auxiliary columns, all initialised with NA
# for the analysis:
#   info_status_1 / info_status_2 - info status of the 1st / 2nd reading
#   amb_generic   - ambiguity in the generic value
#   amb_category  - ambiguity in the category value
#   amb_bridging  - ambiguity regarding a bridging relationship
# for the concatenation of the final label:
#   amb_reason_pre - label for ambiguity regarding info status, bridging, etc.
#   amb_reason_gen - label for ambiguity in the generic value
#   amb_reason_cat - label for ambiguity in the category value
#   amb_reason_fin - result of concatenating the three labels above
for (aux_col in c(
  "info_status_1", "info_status_2",
  "amb_generic", "amb_category", "amb_bridging",
  "amb_reason_pre", "amb_reason_gen", "amb_reason_cat", "amb_reason_fin"
)) {
  combined_df_amb[[aux_col]] <- NA
}
# info_status_1: info status of the first reading, derived from `reference`
#   "old" -> "DO", "new" -> "DN", "non_referring" -> "non_referring",
#   anything else (including NA) -> NA
# case_when() replaces the nested ifelse() calls: it is used for the other
# label columns in this file as well, and it always yields a character
# column, whereas the nested ifelse() returned a logical vector whenever no
# value matched.
combined_df_amb$info_status_1 <- case_when(
  combined_df_amb$reference == "old" ~ "DO",
  combined_df_amb$reference == "new" ~ "DN",
  combined_df_amb$reference == "non_referring" ~ "non_referring",
  TRUE ~ NA_character_ # assign NA for all other cases
)
# info_status_2: info status of the second reading, derived from `ref_type_2`
#   "phrase" / "segment" -> "DO", "new" -> "DN",
#   "non_referring", "undef_reference", "undef_reference_2" -> unchanged,
#   anything else (including NA) -> NA
# case_when() replaces the five-level nested ifelse(): it is used for the
# other label columns in this file as well, and it always yields a character
# column, whereas the nested ifelse() returned a logical vector whenever no
# value matched.
combined_df_amb$info_status_2 <- case_when(
  combined_df_amb$ref_type_2 %in% c("phrase", "segment") ~ "DO",
  combined_df_amb$ref_type_2 == "new" ~ "DN",
  combined_df_amb$ref_type_2 == "non_referring" ~ "non_referring",
  combined_df_amb$ref_type_2 == "undef_reference" ~ "undef_reference",
  combined_df_amb$ref_type_2 == "undef_reference_2" ~ "undef_reference_2",
  TRUE ~ NA_character_ # assign NA for all other cases
)
# The three flags below are "yes"/"no" columns. A missing value in either of
# the two compared columns always yields "no"; the is.na() guards come first
# so that the condition itself can never become NA.

# amb_generic: do the two readings differ in their generic value?
# Author's check: when NA was assigned for generic/generic_2, amb_generic was
# NA (51 cases) and ref_type_2 was "segment" (51 cases). An NA in generic_2
# therefore only occurs when ref_type_2 = segment, in which case generic_2
# cannot be assigned according to the scheme, so an ambiguity between the
# generic values cannot be evaluated and "no" is assigned.
combined_df_amb$amb_generic <- ifelse(
  !is.na(combined_df_amb$generic) &
    !is.na(combined_df_amb$generic_2) &
    combined_df_amb$generic != combined_df_amb$generic_2,
  "yes",
  "no"
)

# amb_category: do the two readings differ in their category value?
combined_df_amb$amb_category <- ifelse(
  !is.na(combined_df_amb$category) &
    !is.na(combined_df_amb$category_2) &
    combined_df_amb$category != combined_df_amb$category_2,
  "yes",
  "no"
)

# amb_bridging: does exactly one of the two readings have a related object?
combined_df_amb$amb_bridging <- ifelse(
  !is.na(combined_df_amb$related_object) &
    !is.na(combined_df_amb$related_object_2) &
    xor(
      combined_df_amb$related_object == "yes",
      combined_df_amb$related_object_2 == "yes"
    ),
  "yes",
  "no"
)
# fill the auxiliary columns that are concatenated into the final label
# "yes" -> label text, "no" -> empty string; every other value (including NA)
# gets a CHECK_* marker to verify that the assignment works correctly
combined_df_amb <- combined_df_amb %>%
  mutate(
    amb_reason_gen = case_when(
      amb_generic == "yes" ~ "generic-value",
      amb_generic == "no" ~ "",
      TRUE ~ "CHECK_gen"
    ),
    amb_reason_cat = case_when(
      amb_category == "yes" ~ "category",
      amb_category == "no" ~ "",
      TRUE ~ "CHECK_cat"
    )
  )
# Labels in this block are set with mutate() instead of `df$col <- "label"`:
# the `$<-` assignment raises an error on a data.frame with zero rows, so an
# empty subset would have stopped the script.

### referring vs. non-referring
# df_non_ref: at least one reading is non_referring, so the content is always
# ambiguous between referring and non_referring and can be labelled directly
df_non_ref <- combined_df_amb %>%
  filter(reference == "non_referring" | ref_type_2 == "non_referring") %>%
  mutate(amb_reason_pre = "referring vs. non_referring")

### referring vs. undefined reference
# df_ref_undef_ref: only ref_type_2 is checked; per the author's note, when
# reference = non_referring or undef_reference, no annotation as ambiguous
# is possible
df_ref_undef_ref <- combined_df_amb %>%
  filter(ref_type_2 %in% c("undef_reference", "undef_reference_2")) %>%
  mutate(amb_reason_pre = "referring vs. undefined_reference")

# keep only referring mentions and exclude potential unmarked markables;
# rows whose ref_type_2 is NA are dropped as well, because filter() only
# keeps rows for which the condition is TRUE
combined_df_ref <- combined_df_amb %>%
  filter(
    !(reference %in% c("unmarked", "non_referring")),
    ref_type_2 != "non_referring"
  )
### DN_DN
# Categorisation using filters. Author's note: the row count of the original
# df (df_DN_DN) and of the resulting df (df_DN_DN_fin) was checked to be the
# same.
# Labels are set with mutate() instead of `df$col <- "label"`: the `$<-`
# assignment raises an error on a data.frame with zero rows, so a category
# without any case would have stopped the script.
# Rows with NA in a compared column match none of the filters below.

# cases that are new in both readings
df_DN_DN <- combined_df_ref %>%
  filter(info_status_1 == "DN", info_status_2 == "DN")

# DN vs. DN without related objects; an ambiguity in generic and/or category
# is added later. Author's note: all markables are ambiguous in either
# generic or category or both except one -> second reading has an object_2
df_DN_DN_only <- df_DN_DN %>%
  filter(related_object == "no", related_object_2 == "no") %>%
  mutate(amb_reason_pre = "DN vs. DN, no related objects")

# DN vs. DN+bridging: exactly one reading has a related object
df_DN_DN_a <- df_DN_DN %>%
  filter(amb_bridging == "yes") %>%
  mutate(amb_reason_pre = "DN vs. DN+bridging")

# both readings bridge: different related phrase, same relation
df_DN_DN_b <- df_DN_DN %>%
  filter(
    related_object == "yes", related_object_2 == "yes",
    related_phrase != related_phrase_2,
    related_rel == related_rel_2
  ) %>%
  mutate(amb_reason_pre = "identifying the right antecedent for bridging (DN)")

# both readings bridge: different related phrase, different relation
df_DN_DN_c <- df_DN_DN %>%
  filter(
    related_object == "yes", related_object_2 == "yes",
    related_phrase != related_phrase_2,
    related_rel != related_rel_2
  ) %>%
  mutate(amb_reason_pre = "identifying the right antecedent for bridging (DN) and ambiguous relation")

# both readings bridge: same related phrase, different relation
df_DN_DN_d <- df_DN_DN %>%
  filter(
    related_object == "yes", related_object_2 == "yes",
    related_phrase == related_phrase_2,
    related_rel != related_rel_2
  ) %>%
  mutate(amb_reason_pre = "DN vs. DN; ambiguous relation")

# both readings bridge: neither the related phrase nor the relation differs.
# Category or generic can still be ambiguous (author's note: in fact, only one case)
df_DN_DN_e <- df_DN_DN %>%
  filter(
    related_object == "yes", related_object_2 == "yes",
    related_phrase == related_phrase_2,
    related_rel == related_rel_2
  ) %>%
  mutate(amb_reason_pre = "DN vs. DN")

# recombine the dfs
df_DN_DN_fin <- rbind(df_DN_DN_only, df_DN_DN_a, df_DN_DN_b, df_DN_DN_c, df_DN_DN_d, df_DN_DN_e)

#### -----> exclusion of one observation: DN vs. DN, no related object. It is amb in on_map, which is not evaluated here.
df_DN_DN_fin <- df_DN_DN_fin %>%
  filter(!(min_ids == "word_69" & file_name == "dia6-1_phrase_level.csv" & id == "markable_17"))
#print(df_DN_DN_fin)
### DN_DO
# first reading is discourse-new, second reading is discourse-old
df_DN_DO <- combined_df_ref %>%
  filter(info_status_1 == "DN", info_status_2 == "DO")

# DO vs. DN without further attributes; single phrase antecedent.
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DN_DO_a <- df_DN_DO %>%
  filter(
    reference == "new", ref_type_2 == "phrase",
    phrase_antecedent_2 == "single_phrase",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO (single) vs. DN")
#this filter is used to identy cases that are DO+bridging vs. DN; single phrase antecedent --------------------------> no such cases
# df_DN_DO_a_1 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "single_phrase" &
# related_object == "no" & related_object_2 == "yes"
# )
# #assign label
# df_DN_DO_a_1$amb_reason_pre <- "DO+bridging (single) vs. DN"
# DO vs. DN+bridging; single phrase antecedent (only the DN reading has a
# related object).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DN_DO_a_2 <- df_DN_DO %>%
  filter(
    reference == "new", ref_type_2 == "phrase",
    phrase_antecedent_2 == "single_phrase",
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO (single) vs. DN+bridging")
#this filter is used to identy cases that are DO (singel) + bridging vs. DN+bridging single phrase antecedent; same relation --------------------------> no such cases
# df_DN_DO_a_3 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "single_phrase" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel == related_rel_2
# )
# #assign label
# df_DN_DO_a_3$amb_reason_pre <- "DO+bridging (single) vs. DN+bridging"
#this filter is used to identy cases that are DO (singel) + bridging vs. DN+bridging single phrase antecedent; different relation --------------------------> no such cases
# df_DN_DO_a_4 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "single_phrase" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel != related_rel_2
# )
# #assign label
# df_DN_DO_a_4$amb_reason_pre <- "DO+bridging (single) vs. DN+bridging; ambiguous relation"
#this filter is used to identy cases that are DO vs. DN without further attributes; multiple phrase antecedents --------------------------> no such cases
# df_DN_DO_b <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "multiple_phrases" &
# related_object == "no" & related_object_2 == "no"
# )
# #assign label
# df_DN_DO_b$amb_reason_pre <- "DO (multiple) vs. DN"
#this filter is used to identy cases that are DO+bridging vs. DN ; multiple phrase antecedents --------------------------> no such cases
# df_DN_DO_b_1 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "multiple_phrases" &
# related_object == "no" & related_object_2 == "yes"
# )
# #assign label
# df_DN_DO_b_1$amb_reason_pre <- "DO+bridging (multiple) vs. DN"
# DO vs. DN+bridging; multiple phrase antecedents (only the DN reading has a
# related object).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DN_DO_b_2 <- df_DN_DO %>%
  filter(
    reference == "new", ref_type_2 == "phrase",
    phrase_antecedent_2 == "multiple_phrases",
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO (multiple) vs. DN+bridging")
# #this filter is used to identy cases that are DO+bridging vs. DN+bridging ; multiple phrase antecedents, related relation is the same --------------------------> no such cases
# df_DN_DO_b_3 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel == related_rel_2
# )
#assign label
# df_DN_DO_b_3$amb_reason_pre <- "DO+bridging (multiple) vs. DN+bridging"
#this filter is used to identy cases that are DO+bridging vs. DN+bridging ; multiple phrase antecedents, related relation is not the same --------------------------> no such cases
# df_DN_DO_b_4 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "phrase" &
# phrase_antecedent_2 == "multiple_phrases" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel != related_rel_2
# )
# #assign label
# df_DN_DO_b_4$amb_reason_pre <- "DO+bridging (multiple) vs. DN+bridging; ambiguous related relation"
# DO+deixis vs. DN without further attributes (second reading refers to a
# segment, no related object in either reading).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DN_DO_c <- df_DN_DO %>%
  filter(
    reference == "new", ref_type_2 == "segment",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "deixis vs. DN")
#this filter is used to identy cases that are DO+deixis+bridging vs. DN without further attributes --------------------------> no such cases
# df_DN_DO_c_1 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "segment" &
# related_object == "no" & related_object_2 == "yes"
# )
# #assign label
# df_DN_DO_c_1$amb_reason_pre <- "deixis+bridging vs. DN"
# DO+deixis vs. DN+bridging (second reading refers to a segment, only the DN
# reading has a related object).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DN_DO_c_2 <- df_DN_DO %>%
  filter(
    reference == "new", ref_type_2 == "segment",
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "deixis vs. DN+bridging")
#this filter is used to identy cases that are DO+deixis+bridging vs. DN+bidging without further attributes --------------------------> no such cases
# df_DN_DO_c_3 <- df_DN_DO %>%
# filter(reference == "new" & ref_type_2 == "segment" &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DN_DO_c_3$amb_reason_pre <- "deixis+briidging vs. DN+bridging"
# stack all labelled DN_DO subsets into one data frame
df_DN_DO_fin <- do.call(
  rbind,
  list(df_DN_DO_a, df_DN_DO_a_2, df_DN_DO_b_2, df_DN_DO_c, df_DN_DO_c_2)
)
#print(df_DN_DO_fin)
### DO_DN
# Author's note: the row count of the original df (df_DO_DN) and of the
# resulting df (df_DO_DN_fin) was checked to be the same.
# Labels are set with mutate() instead of `df$col <- "label"`: the `$<-`
# assignment raises an error on a data.frame with zero rows, so a category
# without any case would have stopped the script.

# first reading is discourse-old, second reading is discourse-new
df_DO_DN <- combined_df_ref %>%
  filter(info_status_1 == "DO", info_status_2 == "DN")

# DO vs. DN without further attributes; single phrase antecedent
df_DO_DN_a <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "single_phrase",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO (single) vs. DN")

# DO+bridging vs. DN; single phrase antecedent
df_DO_DN_a_1 <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "single_phrase",
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO+bridging (single) vs. DN")

# DO vs. DN+bridging; single phrase antecedent
df_DO_DN_a_2 <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "single_phrase",
    related_object == "no", related_object_2 == "yes"
  ) %>%
  mutate(amb_reason_pre = "DO (single) vs. DN+bridging")

# DO (single)+bridging vs. DN+bridging; single phrase antecedent; same relation
df_DO_DN_a_3 <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "single_phrase",
    related_object == "yes", related_object_2 == "yes",
    related_rel == related_rel_2
  ) %>%
  mutate(amb_reason_pre = "DO+bridging (single) vs. DN+bridging")

# DO (single)+bridging vs. DN+bridging; single phrase antecedent; different relation
df_DO_DN_a_4 <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "single_phrase",
    related_object == "yes", related_object_2 == "yes",
    related_rel != related_rel_2
  ) %>%
  mutate(amb_reason_pre = "DO+bridging (single) vs. DN+bridging; ambiguous relation")

# DO vs. DN without further attributes; multiple phrase antecedents
df_DO_DN_b <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "multiple_phrases",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO (multiple) vs. DN")
#this filter is used to identy cases that are DO+bridging vs. DN ; multiple phrase antecedents --------------------------> no such cases
# df_DO_DN_b_1 <- df_DO_DN %>%
# filter(ref_type == "phrase" & ref_type_2 == "new" &
# phrase_antecedent == "multiple_phrases" &
# related_object == "yes" & related_object_2 == "no"
# )
# #assign label
# df_DO_DN_b_1$amb_reason_pre <- "DO+bridging (multiple) vs. DN"
# DO vs. DN+bridging; multiple phrase antecedents (only the DN reading has a
# related object).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DO_DN_b_2 <- df_DO_DN %>%
  filter(
    ref_type == "phrase", ref_type_2 == "new",
    phrase_antecedent == "multiple_phrases",
    related_object == "no", related_object_2 == "yes"
  ) %>%
  mutate(amb_reason_pre = "DO (multiple) vs. DN+bridging")
# #this filter is used to identy cases that are DO+bridging vs. DN+bridging ; multiple phrase antecedents, related relation is the same --------------------------> no such cases
# df_DO_DN_b_3 <- df_DO_DN %>%
# filter(ref_type == "phrase" & ref_type_2 == "new" &
# phrase_antecedent == "multiple_phrases" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel == related_rel_2
# )
# #assign label
# df_DO_DN_b_3$amb_reason_pre <- "DO+bridging (multiple) vs. DN+bridging"
#this filter is used to identy cases that are DO+bridging vs. DN+bridging ; multiple phrase antecedents, related relation is not the same --------------------------> no such cases
# df_DO_DN_b_4 <- df_DO_DN %>%
# filter(ref_type == "phrase" & ref_type_2 == "new" &
# phrase_antecedent == "multiple_phrases" &
# related_object == "yes" & related_object_2 == "yes" &
# related_rel != related_rel_2
# )
# #assign label
# df_DO_DN_b_4$amb_reason_pre <- "DO+bridging (multiple) vs. DN+bridging; ambiguous related relation"
# DO+deixis vs. DN without further attributes (first reading refers to a
# segment, no related object in either reading).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DO_DN_c <- df_DO_DN %>%
  filter(
    ref_type == "segment", ref_type_2 == "new",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "deixis vs. DN")
#this filter is used to identy cases that are DO+deixis+bridging vs. DN without further attributes --------------------------> no such cases
# df_DO_DN_c_1 <- df_DO_DN %>%
# filter(ref_type == "segment" & ref_type_2 == "new" &
# related_object == "yes" & related_object_2 == "no"
# )
# #assign label
# df_DO_DN_c_1$amb_reason_pre <- "deixis+bridging vs. DN"
# DO+deixis vs. DN+bridging (first reading refers to a segment, only the DN
# reading has a related object).
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DO_DN_c_2 <- df_DO_DN %>%
  filter(
    ref_type == "segment", ref_type_2 == "new",
    related_object == "no", related_object_2 == "yes"
  ) %>%
  mutate(amb_reason_pre = "deixis vs. DN+bridging")
#this filter is used to identy cases that are DO+deixis+bridging vs. DN+bidging without further attributes --------------------------> no such cases
# df_DO_DN_c_3 <- df_DO_DN %>%
# filter(ref_type == "segment" & ref_type_2 == "new" &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DN_c_3$amb_reason_pre <- "deixis+briidging vs. DN+bridging"
# stack all labelled DO_DN subsets into one data frame
df_DO_DN_fin <- do.call(
  rbind,
  list(
    df_DO_DN_a, df_DO_DN_a_1, df_DO_DN_a_2, df_DO_DN_a_3, df_DO_DN_a_4,
    df_DO_DN_b, df_DO_DN_b_2, df_DO_DN_c, df_DO_DN_c_2
  )
)
#print(df_DO_DN_fin)
### DO-DO
# Labels are set with mutate() instead of `df$col <- "label"`: the `$<-`
# assignment raises an error on a data.frame with zero rows, so a category
# without any case would have stopped the script.
# Rows with NA in a compared column match none of the filters below.

# both readings are discourse-old
df_DO_DO <- combined_df_ref %>%
  filter(info_status_1 == "DO", info_status_2 == "DO")

# identifying the right antecedent: two different single phrase antecedents,
# no related object (thus no related_phrase either).
# Author's note: amb_ante_deixis and amb_bridging are always "no" in df_DO_DO_a
df_DO_DO_a <- df_DO_DO %>%
  filter(
    ref_type == "phrase", ref_type_2 == "phrase",
    phrase_antecedent == "single_phrase", phrase_antecedent_2 == "single_phrase",
    single_phrase_antecedent != single_phrase_antecedent_2,
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "identifying the right antecedent")

# identifying the right antecedent + bridging in the first reading
df_DO_DO_b <- df_DO_DO %>%
  filter(
    ref_type == "phrase", ref_type_2 == "phrase",
    phrase_antecedent == "single_phrase", phrase_antecedent_2 == "single_phrase",
    single_phrase_antecedent != single_phrase_antecedent_2,
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "identifying the right antecedent + bridging in 1")

# DO vs. DO with the same single phrase antecedent; an ambiguity in generic
# and category is added later
### -----------------------> one observation: GB mrcnf10g word_1137; NOT ambiguous
df_DO_DO_b1 <- df_DO_DO %>%
  filter(
    ref_type == "phrase", ref_type_2 == "phrase",
    phrase_antecedent == "single_phrase", phrase_antecedent_2 == "single_phrase",
    single_phrase_antecedent == single_phrase_antecedent_2,
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "DO vs. DO")

# identifying the right antecedent + bridging in the second reading
df_DO_DO_c <- df_DO_DO %>%
  filter(
    ref_type == "phrase", ref_type_2 == "phrase",
    phrase_antecedent == "single_phrase", phrase_antecedent_2 == "single_phrase",
    single_phrase_antecedent != single_phrase_antecedent_2,
    related_object == "no", related_object_2 == "yes"
  ) %>%
  mutate(amb_reason_pre = "identifying the right antecedent + bridging in 2")
# filter for identifying the right antecedent + bridging in both ---------------------------------------> no such cases
# df_DO_DO_c1 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "single_phrase" & phrase_antecedent_2 == "single_phrase" &
# single_phrase_antecedent != single_phrase_antecedent_2 &
# related_object == "yes" & related_object_2 == "yes"
# )
# filter for identifying the right antecedent for bridging ---------------------------------------> no such cases
# df_DO_DO_c2 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "single_phrase" & phrase_antecedent_2 == "single_phrase" &
# single_phrase_antecedent == single_phrase_antecedent_2 &
# related_object == "yes" & related_object_2 == "yes"
# )
# antecedent vs. deixis: first reading refers to a phrase, second reading to
# a segment; no related object in either reading.
# The label is set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.
df_DO_DO_d <- df_DO_DO %>%
  filter(
    ref_type == "phrase", ref_type_2 == "segment",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "antecedent vs. deixis")
# filter for antecedent+bridging vs. deixis ---------------------------------------> no such cases
# df_DO_DO_d_1 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "segment" &
# related_object == "yes" & related_object_2 == "no")
# #assign label
# df_DO_DO_d_2$amb_reason_pre <- "antecedent+bridging vs. deixis"
# filter for antecedent vs. deixis+bridging ---------------------------------------> no such cases
# df_DO_DO_d_2 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "segment" &
# related_object == "no" & related_object_2 == "yes")
# #assign label
# df_DO_DO_d_2$amb_reason_pre <- "antecedent vs. deixis+bridging"
# filter for antecedent+brdiging vs. deixis+bridging ---------------------------------------> no such cases
# df_DO_DO_d_3 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "segment" &
# related_object == "no" & related_object_2 == "yes")
# #assign label
# df_DO_DO_d_3$amb_reason_pre <- "antecedent+bridging vs. deixis+bridging"
# The three subsets below cover the reversed order: the first reading refers
# to a segment (deixis), the second reading to a phrase (antecedent).
# Labels are set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.

# antecedent vs. deixis; no related object in either reading
df_DO_DO_e <- df_DO_DO %>%
  filter(
    ref_type == "segment", ref_type_2 == "phrase",
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "antecedent vs. deixis")

# antecedent vs. deixis+related_object (related object in the first reading only)
df_DO_DO_e1 <- df_DO_DO %>%
  filter(
    ref_type == "segment", ref_type_2 == "phrase",
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "antecedent vs. deixis+bridging")

# antecedent+related_object vs. deixis (related object in the second reading only)
df_DO_DO_e2 <- df_DO_DO %>%
  filter(
    ref_type == "segment", ref_type_2 == "phrase",
    related_object == "no", related_object_2 == "yes"
  ) %>%
  mutate(amb_reason_pre = "antecedent+bridging vs. deixis")
# filter for antecedent+related_object vs. deixis+related object ----------------------------------------> no such cases
# df_DO_DO_e3 <- df_DO_DO %>%
# filter(ref_type == "segment" & ref_type_2 == "phrase" &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_e3$amb_reason_pre <- "antecedent+related_object vs. deixis"+related object
# Both readings refer to a segment, but to different ones.
# Labels are set with mutate() instead of `df$col <- "label"`, which raises
# an error on a data.frame with zero rows.

# ambiguity in the deixis antecedent; no related object in either reading
df_DO_DO_f <- df_DO_DO %>%
  filter(
    ref_type == "segment", ref_type_2 == "segment",
    segment_antecedent != segment_antecedent_2,
    related_object == "no", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "identifying the right segment for deixis")

# ambiguity in the deixis antecedent + related object in the first reading
df_DO_DO_f1 <- df_DO_DO %>%
  filter(
    ref_type == "segment", ref_type_2 == "segment",
    segment_antecedent != segment_antecedent_2,
    related_object == "yes", related_object_2 == "no"
  ) %>%
  mutate(amb_reason_pre = "identifying the right segment for deixis + related_object in 1")
# filter for amb in deixis antecedent + related object in 2 ----------------------------------------> no such cases
# df_DO_DO_f2 <- df_DO_DO %>%
# filter(ref_type == "segment" & ref_type_2 == "segment" &
# segment_antecedent != segment_antecedent_2 &
# related_object == "no" & related_object_2 == "yes"
# )
#assign label
# df_DO_DO_f2$amb_reason_pre <- "identifying the right segment for deixis + related_object in 2"
# filter for amb in deixis antecedent + related object in both ---------------------------------------> no such cases
# df_DO_DO_f3 <- df_DO_DO %>%
# filter(ref_type == "segment" & ref_type_2 == "segment" &
# segment_antecedent != segment_antecedent_2 &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_f3$amb_reason_pre <- "identifying the right segment for deixis + related_object in both"
# Both readings refer to phrases, one with multiple and one with a single
# phrase antecedent; no related object in either reading. Both directions
# receive the same label.

# multiple phrases in reading 1, single phrase in reading 2
df_DO_DO_g <- filter(
  df_DO_DO,
  ref_type == "phrase", ref_type_2 == "phrase",
  phrase_antecedent == "multiple_phrases", phrase_antecedent_2 == "single_phrase",
  related_object == "no", related_object_2 == "no"
)
df_DO_DO_g$amb_reason_pre <- "single vs. multiple antecedents"

# single phrase in reading 1, multiple phrases in reading 2
df_DO_DO_g4 <- filter(
  df_DO_DO,
  ref_type == "phrase", ref_type_2 == "phrase",
  phrase_antecedent == "single_phrase", phrase_antecedent_2 == "multiple_phrases",
  related_object == "no", related_object_2 == "no"
)
df_DO_DO_g4$amb_reason_pre <- "single vs. multiple antecedents"
# filter for single vs. multiple antecedents and related object in 1 ---------------------------------------> no such cases
# df_DO_DO_g1 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "single_phrase" &
# related_object == "yes" & related_object_2 == "no"
# )
# #assign label
# df_DO_DO_g1$amb_reason_pre <- "single vs. multiple antecedents and related object in 1"
# filter for single vs. multiple antecedents and related object in 2 ---------------------------------------> no such cases
# df_DO_DO_g2 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "single_phrase" &
# related_object == "no" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_g2$amb_reason_pre <- "single vs. multiple antecedents and related object in 2"
# filter for single vs. multiple antecedents and related object in both ---------------------------------------> no such cases
# df_DO_DO_g3 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "single_phrase" &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_g3$amb_reason_pre <- "single vs. multiple antecedents and related object in both"
# Both readings have multiple phrase antecedents, but the antecedent sets
# differ; no related object in either reading.
df_DO_DO_h <- filter(
  df_DO_DO,
  ref_type == "phrase", ref_type_2 == "phrase",
  phrase_antecedent == "multiple_phrases", phrase_antecedent_2 == "multiple_phrases",
  multiple_phrase_antecedents != multiple_phrase_antecedents_2,
  related_object == "no", related_object_2 == "no"
)
df_DO_DO_h$amb_reason_pre <- "identifying the right antecedents"
# filter for identifying the right antecedents + related_object in 1 ---------------------------------------> no such cases
# df_DO_DO_h1 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "multiple_phrases" &
# multiple_phrase_antecedents != multiple_phrase_antecedents_2 &
# related_object == "yes" & related_object_2 == "no"
# )
# #assign label
# df_DO_DO_h1$amb_reason_pre <- "identifying the right antecedents + related_object in 1"
# filter for identifying the right antecedents + related_object in 2 ---------------------------------------> no such cases
# df_DO_DO_h2 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "multiple_phrases" &
# multiple_phrase_antecedents != multiple_phrase_antecedents_2 &
# related_object == "no" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_h2$amb_reason_pre <- "identifying the right antecedents + related_object in 2"
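# filter for identifying the right antecedents + related_object in both ---------------------------------------> no such cases
# NOTE(review): unlike df_DO_DO_h, h1 and h2, the condition below compares the antecedents with `==` instead of `!=` -- presumably a copy error; re-check the "no such cases" result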
# df_DO_DO_h3 <- df_DO_DO %>%
# filter(ref_type == "phrase" & ref_type_2 == "phrase" &
# phrase_antecedent == "multiple_phrases" & phrase_antecedent_2 == "multiple_phrases" &
# multiple_phrase_antecedents == multiple_phrase_antecedents_2 &
# related_object == "yes" & related_object_2 == "yes"
# )
# #assign label
# df_DO_DO_h3$amb_reason_pre <- "identifying the right antecedents + related_object in both"
### Stack the labelled DO-vs-DO subsets. df_DO_DO_b1 is left out on purpose:
### it holds one observation that is NOT ambiguous DO vs. DO.
df_DO_DO_fin <- rbind(
  df_DO_DO_a, df_DO_DO_b, df_DO_DO_c, df_DO_DO_d,
  df_DO_DO_e, df_DO_DO_e1, df_DO_DO_e2,
  df_DO_DO_f, df_DO_DO_f1,
  df_DO_DO_g, df_DO_DO_g4,
  df_DO_DO_h
)

# put the categorised data frames of referring markables back together
df_categorised_referring <- rbind(
  df_DN_DN_fin,
  df_DO_DO_fin,
  df_DO_DN_fin,
  df_DN_DO_fin,
  df_ref_undef_ref
)
# print(df_categorised_referring)

# add the non-referring markables
df_categorised_1 <- rbind(df_categorised_referring, df_non_ref)
# print(df_categorised_1)

# Sanity check: within each file, keep every row whose id occurs more than
# once (first and later occurrences alike). An empty result means that no
# markable appears twice in df_categorised_1.
double_markables <- filter(
  group_by(df_categorised_1, file_name),
  duplicated(id) | duplicated(id, fromLast = TRUE)
)
print(double_markables)
## # A tibble: 0 × 47
## # Groups: file_name [0]
## # ℹ 47 variables: min_words <chr>, min_ids <chr>, id <chr>, file_name <chr>,
## # span <chr>, mmax_level <chr>, dataset <chr>, comment <chr>, number <chr>,
## # gender <chr>, person <chr>, gram_fnc <chr>, ambiguity <chr>,
## # category <chr>, category_2 <chr>, generic <chr>, generic_2 <chr>,
## # multiple_phrase_antecedents <chr>, multiple_phrase_antecedents_2 <chr>,
## # non_ref_type <chr>, non_ref_type_2 <chr>, object <chr>, object_2 <chr>,
## # phrase_antecedent <chr>, phrase_antecedent_2 <chr>, ref_type <chr>, …
# create the final data frame from the recombined, categorised markables;
# the combined label column amb_reason_fin is added to it further below
df_categorised_fin <- df_categorised_1
# Join selected entries of one row into a single label string, without
# leaving superfluous separators behind for empty entries.
#
# Arguments:
#   row            named vector holding one row (e.g. as handed over by apply())
#   columns        names of the entries of `row` to join, in output order
#   separator      string placed between the joined entries
#   include_empty  if FALSE (default), entries that are "" or NA are dropped
#                  before joining; if TRUE, all entries are joined as they are
# Returns a single character string ("" if nothing is left to join).
concatenate_with_condition <- function(row, columns, separator = "; ", include_empty = FALSE) {
  selected_row <- row[columns]
  if (!include_empty) {
    # `NA != ""` evaluates to NA, and subsetting with NA keeps an NA element
    # that paste() would render as the text "NA"; so test for NA explicitly
    keep <- !is.na(selected_row) & selected_row != ""
    selected_row <- selected_row[keep]
  }
  paste(selected_row, collapse = separator)
}
# label columns that are combined into the final reason for ambiguity
columns_to_use <- c("amb_reason_pre", "amb_reason_gen", "amb_reason_cat")
# build amb_reason_fin row by row: join the non-empty labels with "; "
df_categorised_fin$amb_reason_fin <- apply(
  X = df_categorised_fin[, columns_to_use],
  MARGIN = 1,
  FUN = concatenate_with_condition,
  columns = columns_to_use,
  separator = "; ",
  include_empty = FALSE
)
## [1] "List of labels and their count in ARRAU and PD:"
##
## antecedent vs. deixis
## 21
## antecedent vs. deixis; category
## 15
## antecedent vs. deixis; generic-value
## 1
## antecedent vs. deixis+bridging
## 1
## antecedent+bridging vs. deixis
## 1
## deixis vs. DN
## 2
## deixis vs. DN+bridging
## 12
## deixis vs. DN+bridging; category
## 3
## deixis vs. DN+bridging; generic-value
## 1
## DN vs. DN, no related objects; category
## 8
## DN vs. DN, no related objects; generic-value
## 19
## DN vs. DN, no related objects; generic-value; category
## 5
## DN vs. DN; ambiguous relation
## 1
## DN vs. DN; ambiguous relation; generic-value
## 2
## DN vs. DN; generic-value
## 1
## DN vs. DN+bridging
## 1
## DN vs. DN+bridging; category
## 11
## DN vs. DN+bridging; generic-value
## 9
## DN vs. DN+bridging; generic-value; category
## 2
## DO (multiple) vs. DN
## 3
## DO (multiple) vs. DN; category
## 1
## DO (multiple) vs. DN; generic-value
## 1
## DO (multiple) vs. DN+bridging
## 4
## DO (multiple) vs. DN+bridging; category
## 2
## DO (multiple) vs. DN+bridging; generic-value
## 3
## DO (multiple) vs. DN+bridging; generic-value; category
## 2
## DO (single) vs. DN
## 115
## DO (single) vs. DN; category
## 63
## DO (single) vs. DN; generic-value
## 84
## DO (single) vs. DN; generic-value; category
## 11
## DO (single) vs. DN+bridging
## 259
## DO (single) vs. DN+bridging; category
## 39
## DO (single) vs. DN+bridging; generic-value
## 78
## DO (single) vs. DN+bridging; generic-value; category
## 14
## DO+bridging (single) vs. DN
## 4
## DO+bridging (single) vs. DN; generic-value
## 1
## DO+bridging (single) vs. DN+bridging
## 1
## DO+bridging (single) vs. DN+bridging; ambiguous relation
## 1
## DO+bridging (single) vs. DN+bridging; ambiguous relation; category
## 2
## identifying the right antecedent
## 131
## identifying the right antecedent + bridging in 1
## 3
## identifying the right antecedent + bridging in 1; category
## 1
## identifying the right antecedent + bridging in 2
## 1
## identifying the right antecedent + bridging in 2; generic-value
## 2
## identifying the right antecedent for bridging (DN)
## 5
## identifying the right antecedent for bridging (DN) and ambiguous relation
## 2
## identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 1
## identifying the right antecedent for bridging (DN); generic-value
## 1
## identifying the right antecedent; category
## 43
## identifying the right antecedent; generic-value
## 23
## identifying the right antecedent; generic-value; category
## 5
## identifying the right antecedents
## 5
## identifying the right antecedents; category
## 2
## identifying the right segment for deixis
## 21
## identifying the right segment for deixis + related_object in 1
## 1
## identifying the right segment for deixis; category
## 1
## referring vs. non_referring
## 282
## referring vs. undefined_reference
## 5
## single vs. multiple antecedents
## 24
## single vs. multiple antecedents; category
## 7
## single vs. multiple antecedents; generic-value
## 9
## single vs. multiple antecedents; generic-value; category
## 1
## amb_reason_fin
## 1 DN vs. DN+bridging
## 2 DN vs. DN+bridging; category
## 3 DN vs. DN+bridging; generic-value
## 4 DN vs. DN+bridging; generic-value; category
## 5 DN vs. DN, no related objects; category
## 6 DN vs. DN, no related objects; generic-value
## 7 DN vs. DN, no related objects; generic-value; category
## 8 DN vs. DN; ambiguous relation
## 9 DN vs. DN; ambiguous relation; generic-value
## 10 DN vs. DN; generic-value
## 11 DO (multiple) vs. DN
## 12 DO (multiple) vs. DN+bridging
## 13 DO (multiple) vs. DN+bridging; category
## 14 DO (multiple) vs. DN+bridging; generic-value
## 15 DO (multiple) vs. DN+bridging; generic-value; category
## 16 DO (multiple) vs. DN; category
## 17 DO (multiple) vs. DN; generic-value
## 18 DO (single) vs. DN
## 19 DO (single) vs. DN+bridging
## 20 DO (single) vs. DN+bridging; category
## 21 DO (single) vs. DN+bridging; generic-value
## 22 DO (single) vs. DN+bridging; generic-value; category
## 23 DO (single) vs. DN; category
## 24 DO (single) vs. DN; generic-value
## 25 DO (single) vs. DN; generic-value; category
## 26 DO+bridging (single) vs. DN
## 27 DO+bridging (single) vs. DN+bridging
## 28 DO+bridging (single) vs. DN+bridging; ambiguous relation
## 29 DO+bridging (single) vs. DN+bridging; ambiguous relation; category
## 30 DO+bridging (single) vs. DN; generic-value
## 31 antecedent vs. deixis
## 32 antecedent vs. deixis+bridging
## 33 antecedent vs. deixis; category
## 34 antecedent vs. deixis; generic-value
## 35 antecedent+bridging vs. deixis
## 36 deixis vs. DN
## 37 deixis vs. DN+bridging
## 38 deixis vs. DN+bridging; category
## 39 deixis vs. DN+bridging; generic-value
## 40 identifying the right antecedent
## 41 identifying the right antecedent + bridging in 1
## 42 identifying the right antecedent + bridging in 1; category
## 43 identifying the right antecedent + bridging in 2
## 44 identifying the right antecedent + bridging in 2; generic-value
## 45 identifying the right antecedent for bridging (DN)
## 46 identifying the right antecedent for bridging (DN) and ambiguous relation
## 47 identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 48 identifying the right antecedent for bridging (DN); generic-value
## 49 identifying the right antecedent; category
## 50 identifying the right antecedent; generic-value
## 51 identifying the right antecedent; generic-value; category
## 52 identifying the right antecedents
## 53 identifying the right antecedents; category
## 54 identifying the right segment for deixis
## 55 identifying the right segment for deixis + related_object in 1
## 56 identifying the right segment for deixis; category
## 57 referring vs. non_referring
## 58 referring vs. undefined_reference
## 59 single vs. multiple antecedents
## 60 single vs. multiple antecedents; category
## 61 single vs. multiple antecedents; generic-value
## 62 single vs. multiple antecedents; generic-value; category
## n percentage
## 1 1 0.07246377
## 2 11 0.79710145
## 3 9 0.65217391
## 4 2 0.14492754
## 5 8 0.57971014
## 6 19 1.37681159
## 7 5 0.36231884
## 8 1 0.07246377
## 9 2 0.14492754
## 10 1 0.07246377
## 11 3 0.21739130
## 12 4 0.28985507
## 13 2 0.14492754
## 14 3 0.21739130
## 15 2 0.14492754
## 16 1 0.07246377
## 17 1 0.07246377
## 18 115 8.33333333
## 19 259 18.76811594
## 20 39 2.82608696
## 21 78 5.65217391
## 22 14 1.01449275
## 23 63 4.56521739
## 24 84 6.08695652
## 25 11 0.79710145
## 26 4 0.28985507
## 27 1 0.07246377
## 28 1 0.07246377
## 29 2 0.14492754
## 30 1 0.07246377
## 31 21 1.52173913
## 32 1 0.07246377
## 33 15 1.08695652
## 34 1 0.07246377
## 35 1 0.07246377
## 36 2 0.14492754
## 37 12 0.86956522
## 38 3 0.21739130
## 39 1 0.07246377
## 40 131 9.49275362
## 41 3 0.21739130
## 42 1 0.07246377
## 43 1 0.07246377
## 44 2 0.14492754
## 45 5 0.36231884
## 46 2 0.14492754
## 47 1 0.07246377
## 48 1 0.07246377
## 49 43 3.11594203
## 50 23 1.66666667
## 51 5 0.36231884
## 52 5 0.36231884
## 53 2 0.14492754
## 54 21 1.52173913
## 55 1 0.07246377
## 56 1 0.07246377
## 57 282 20.43478261
## 58 5 0.36231884
## 59 24 1.73913043
## 60 7 0.50724638
## 61 9 0.65217391
## 62 1 0.07246377
## amb_reason_fin n percentage
## 1 DN vs. DN, no related objects; generic-value 19 1.376812
## 2 DO (single) vs. DN+bridging; category 39 2.826087
## 3 DO (single) vs. DN+bridging; generic-value; category 14 1.014493
## 4 antecedent vs. deixis 21 1.521739
## 5 antecedent vs. deixis; category 15 1.086957
## 6 identifying the right antecedent; generic-value 23 1.666667
## 7 identifying the right segment for deixis 21 1.521739
## 8 single vs. multiple antecedents 24 1.739130
The plots show the distribution of reasons for ambiguity in percent for all markables annotated as being ambiguous in PD and ARRAU.
## amb_reason_fin
## 1 DN vs. DN+bridging
## 2 DN vs. DN+bridging; category
## 3 DN vs. DN+bridging; generic-value
## 4 DN vs. DN+bridging; generic-value; category
## 5 DN vs. DN, no related objects; category
## 6 DN vs. DN, no related objects; generic-value
## 7 DN vs. DN, no related objects; generic-value; category
## 8 DN vs. DN; ambiguous relation
## 9 DN vs. DN; ambiguous relation; generic-value
## 10 DN vs. DN; generic-value
## 11 DO (multiple) vs. DN
## 12 DO (multiple) vs. DN+bridging
## 13 DO (multiple) vs. DN+bridging; category
## 14 DO (multiple) vs. DN+bridging; generic-value
## 15 DO (multiple) vs. DN+bridging; generic-value; category
## 16 DO (multiple) vs. DN; category
## 17 DO (multiple) vs. DN; generic-value
## 18 DO (single) vs. DN
## 19 DO (single) vs. DN+bridging
## 20 DO (single) vs. DN+bridging; category
## 21 DO (single) vs. DN+bridging; generic-value
## 22 DO (single) vs. DN+bridging; generic-value; category
## 23 DO (single) vs. DN; category
## 24 DO (single) vs. DN; generic-value
## 25 DO (single) vs. DN; generic-value; category
## 26 DO+bridging (single) vs. DN
## 27 DO+bridging (single) vs. DN+bridging
## 28 DO+bridging (single) vs. DN+bridging; ambiguous relation
## 29 DO+bridging (single) vs. DN+bridging; ambiguous relation; category
## 30 DO+bridging (single) vs. DN; generic-value
## 31 antecedent vs. deixis
## 32 antecedent vs. deixis+bridging
## 33 antecedent vs. deixis; category
## 34 antecedent vs. deixis; generic-value
## 35 antecedent+bridging vs. deixis
## 36 deixis vs. DN
## 37 deixis vs. DN+bridging
## 38 deixis vs. DN+bridging; category
## 39 deixis vs. DN+bridging; generic-value
## 40 identifying the right antecedent
## 41 identifying the right antecedent + bridging in 1
## 42 identifying the right antecedent + bridging in 1; category
## 43 identifying the right antecedent + bridging in 2
## 44 identifying the right antecedent + bridging in 2; generic-value
## 45 identifying the right antecedent for bridging (DN)
## 46 identifying the right antecedent for bridging (DN) and ambiguous relation
## 47 identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 48 identifying the right antecedent for bridging (DN); generic-value
## 49 identifying the right antecedent; category
## 50 identifying the right antecedent; generic-value
## 51 identifying the right antecedent; generic-value; category
## 52 identifying the right antecedents
## 53 identifying the right antecedents; category
## 54 identifying the right segment for deixis
## 55 identifying the right segment for deixis + related_object in 1
## 56 identifying the right segment for deixis; category
## 57 referring vs. non_referring
## 58 referring vs. undefined_reference
## 59 single vs. multiple antecedents
## 60 single vs. multiple antecedents; category
## 61 single vs. multiple antecedents; generic-value
## 62 single vs. multiple antecedents; generic-value; category
## n percentage
## 1 1 0.07246377
## 2 11 0.79710145
## 3 9 0.65217391
## 4 2 0.14492754
## 5 8 0.57971014
## 6 19 1.37681159
## 7 5 0.36231884
## 8 1 0.07246377
## 9 2 0.14492754
## 10 1 0.07246377
## 11 3 0.21739130
## 12 4 0.28985507
## 13 2 0.14492754
## 14 3 0.21739130
## 15 2 0.14492754
## 16 1 0.07246377
## 17 1 0.07246377
## 18 115 8.33333333
## 19 259 18.76811594
## 20 39 2.82608696
## 21 78 5.65217391
## 22 14 1.01449275
## 23 63 4.56521739
## 24 84 6.08695652
## 25 11 0.79710145
## 26 4 0.28985507
## 27 1 0.07246377
## 28 1 0.07246377
## 29 2 0.14492754
## 30 1 0.07246377
## 31 21 1.52173913
## 32 1 0.07246377
## 33 15 1.08695652
## 34 1 0.07246377
## 35 1 0.07246377
## 36 2 0.14492754
## 37 12 0.86956522
## 38 3 0.21739130
## 39 1 0.07246377
## 40 131 9.49275362
## 41 3 0.21739130
## 42 1 0.07246377
## 43 1 0.07246377
## 44 2 0.14492754
## 45 5 0.36231884
## 46 2 0.14492754
## 47 1 0.07246377
## 48 1 0.07246377
## 49 43 3.11594203
## 50 23 1.66666667
## 51 5 0.36231884
## 52 5 0.36231884
## 53 2 0.14492754
## 54 21 1.52173913
## 55 1 0.07246377
## 56 1 0.07246377
## 57 282 20.43478261
## 58 5 0.36231884
## 59 24 1.73913043
## 60 7 0.50724638
## 61 9 0.65217391
## 62 1 0.07246377
The plot shows the distribution of the labels assigned for the reason of ambiguity, with labels whose percentage is smaller than 0.5% conflated into the category “other”.
# Split the categorised data by corpus and domain for separate evaluation.

# ARRAU: RST and Trains domains (RST rows first, then Trains)
df_categorised_RST <- filter(df_categorised_fin, dataset == "RST")
df_categorised_Trains <- filter(df_categorised_fin, dataset == "Trains_91")
df_categorised_ARRAU <- rbind(df_categorised_RST, df_categorised_Trains)

# PD: Gutenberg and Wikipedia domains (Gutenberg rows first, then Wikipedia)
df_categorised_gutenberg <- filter(df_categorised_fin, dataset == "GB")
df_categorised_wiki <- filter(df_categorised_fin, dataset == "Wiki")
df_categorised_PD <- rbind(df_categorised_gutenberg, df_categorised_wiki)
## amb_reason_fin n
## 1 DN vs. DN+bridging 1
## 2 DN vs. DN+bridging; category 4
## 3 DN vs. DN+bridging; generic-value 7
## 4 DN vs. DN+bridging; generic-value; category 1
## 5 DN vs. DN, no related objects; category 2
## 6 DN vs. DN, no related objects; generic-value 16
## 7 DN vs. DN, no related objects; generic-value; category 1
## 8 DN vs. DN; ambiguous relation; generic-value 2
## 9 DN vs. DN; generic-value 1
## 10 DO (multiple) vs. DN 2
## 11 DO (multiple) vs. DN+bridging 3
## 12 DO (multiple) vs. DN+bridging; generic-value; category 1
## 13 DO (single) vs. DN 33
## 14 DO (single) vs. DN+bridging 79
## 15 DO (single) vs. DN+bridging; category 18
## 16 DO (single) vs. DN+bridging; generic-value 36
## 17 DO (single) vs. DN+bridging; generic-value; category 8
## 18 DO (single) vs. DN; category 10
## 19 DO (single) vs. DN; generic-value 62
## 20 DO (single) vs. DN; generic-value; category 7
## 21 DO+bridging (single) vs. DN 3
## 22 DO+bridging (single) vs. DN+bridging; ambiguous relation; category 1
## 23 antecedent vs. deixis 12
## 24 antecedent vs. deixis+bridging 1
## 25 antecedent vs. deixis; category 7
## 26 antecedent vs. deixis; generic-value 1
## 27 deixis vs. DN 1
## 28 deixis vs. DN+bridging 4
## 29 deixis vs. DN+bridging; category 1
## 30 deixis vs. DN+bridging; generic-value 1
## 31 identifying the right antecedent 43
## 32 identifying the right antecedent for bridging (DN) 3
## 33 identifying the right antecedent for bridging (DN) and ambiguous relation 1
## 34 identifying the right antecedent for bridging (DN); generic-value 1
## 35 identifying the right antecedent; category 8
## 36 identifying the right antecedent; generic-value 7
## 37 identifying the right antecedent; generic-value; category 2
## 38 identifying the right antecedents 2
## 39 identifying the right antecedents; category 1
## 40 identifying the right segment for deixis 13
## 41 identifying the right segment for deixis + related_object in 1 1
## 42 referring vs. non_referring 14
## 43 referring vs. undefined_reference 2
## 44 single vs. multiple antecedents 1
## 45 single vs. multiple antecedents; category 2
## percentage dataset
## 1 0.2341920 RST
## 2 0.9367681 RST
## 3 1.6393443 RST
## 4 0.2341920 RST
## 5 0.4683841 RST
## 6 3.7470726 RST
## 7 0.2341920 RST
## 8 0.4683841 RST
## 9 0.2341920 RST
## 10 0.4683841 RST
## 11 0.7025761 RST
## 12 0.2341920 RST
## 13 7.7283372 RST
## 14 18.5011710 RST
## 15 4.2154567 RST
## 16 8.4309133 RST
## 17 1.8735363 RST
## 18 2.3419204 RST
## 19 14.5199063 RST
## 20 1.6393443 RST
## 21 0.7025761 RST
## 22 0.2341920 RST
## 23 2.8103044 RST
## 24 0.2341920 RST
## 25 1.6393443 RST
## 26 0.2341920 RST
## 27 0.2341920 RST
## 28 0.9367681 RST
## 29 0.2341920 RST
## 30 0.2341920 RST
## 31 10.0702576 RST
## 32 0.7025761 RST
## 33 0.2341920 RST
## 34 0.2341920 RST
## 35 1.8735363 RST
## 36 1.6393443 RST
## 37 0.4683841 RST
## 38 0.4683841 RST
## 39 0.2341920 RST
## 40 3.0444965 RST
## 41 0.2341920 RST
## 42 3.2786885 RST
## 43 0.4683841 RST
## 44 0.2341920 RST
## 45 0.4683841 RST
## amb_reason_fin n percentage
## 1 DN vs. DN, no related objects; category 1 0.625
## 2 DN vs. DN, no related objects; generic-value 2 1.250
## 3 DO (multiple) vs. DN+bridging 1 0.625
## 4 DO (multiple) vs. DN+bridging; generic-value 1 0.625
## 5 DO (multiple) vs. DN; generic-value 1 0.625
## 6 DO (single) vs. DN 7 4.375
## 7 DO (single) vs. DN+bridging 14 8.750
## 8 DO (single) vs. DN+bridging; category 3 1.875
## 9 DO (single) vs. DN+bridging; generic-value 16 10.000
## 10 DO (single) vs. DN+bridging; generic-value; category 2 1.250
## 11 DO (single) vs. DN; category 1 0.625
## 12 DO (single) vs. DN; generic-value 7 4.375
## 13 DO+bridging (single) vs. DN; generic-value 1 0.625
## 14 antecedent vs. deixis 1 0.625
## 15 antecedent vs. deixis; category 6 3.750
## 16 identifying the right antecedent 20 12.500
## 17 identifying the right antecedent + bridging in 2 1 0.625
## 18 identifying the right antecedent; category 3 1.875
## 19 identifying the right antecedent; generic-value 6 3.750
## 20 identifying the right antecedent; generic-value; category 1 0.625
## 21 identifying the right antecedents 2 1.250
## 22 identifying the right segment for deixis 8 5.000
## 23 referring vs. non_referring 26 16.250
## 24 referring vs. undefined_reference 3 1.875
## 25 single vs. multiple antecedents 14 8.750
## 26 single vs. multiple antecedents; category 3 1.875
## 27 single vs. multiple antecedents; generic-value 8 5.000
## 28 single vs. multiple antecedents; generic-value; category 1 0.625
## dataset
## 1 Trains
## 2 Trains
## 3 Trains
## 4 Trains
## 5 Trains
## 6 Trains
## 7 Trains
## 8 Trains
## 9 Trains
## 10 Trains
## 11 Trains
## 12 Trains
## 13 Trains
## 14 Trains
## 15 Trains
## 16 Trains
## 17 Trains
## 18 Trains
## 19 Trains
## 20 Trains
## 21 Trains
## 22 Trains
## 23 Trains
## 24 Trains
## 25 Trains
## 26 Trains
## 27 Trains
## 28 Trains
The plot compares the distribution of reasons for ambiguity between the two domains of ARRAU. If only one bar is shown for a label, its percentage in the other domain is below the threshold of 0.8%.
## amb_reason_fin n
## 1 DN vs. DN+bridging 1
## 2 DN vs. DN+bridging; category 4
## 3 DN vs. DN+bridging; generic-value 7
## 4 DN vs. DN+bridging; generic-value; category 1
## 5 DN vs. DN, no related objects; category 3
## 6 DN vs. DN, no related objects; generic-value 18
## 7 DN vs. DN, no related objects; generic-value; category 1
## 8 DN vs. DN; ambiguous relation; generic-value 2
## 9 DN vs. DN; generic-value 1
## 10 DO (multiple) vs. DN 2
## 11 DO (multiple) vs. DN+bridging 4
## 12 DO (multiple) vs. DN+bridging; generic-value 1
## 13 DO (multiple) vs. DN+bridging; generic-value; category 1
## 14 DO (multiple) vs. DN; generic-value 1
## 15 DO (single) vs. DN 40
## 16 DO (single) vs. DN+bridging 93
## 17 DO (single) vs. DN+bridging; category 21
## 18 DO (single) vs. DN+bridging; generic-value 52
## 19 DO (single) vs. DN+bridging; generic-value; category 10
## 20 DO (single) vs. DN; category 11
## 21 DO (single) vs. DN; generic-value 69
## 22 DO (single) vs. DN; generic-value; category 7
## 23 DO+bridging (single) vs. DN 3
## 24 DO+bridging (single) vs. DN+bridging; ambiguous relation; category 1
## 25 DO+bridging (single) vs. DN; generic-value 1
## 26 antecedent vs. deixis 13
## 27 antecedent vs. deixis+bridging 1
## 28 antecedent vs. deixis; category 13
## 29 antecedent vs. deixis; generic-value 1
## 30 deixis vs. DN 1
## 31 deixis vs. DN+bridging 4
## 32 deixis vs. DN+bridging; category 1
## 33 deixis vs. DN+bridging; generic-value 1
## 34 identifying the right antecedent 63
## 35 identifying the right antecedent + bridging in 2 1
## 36 identifying the right antecedent for bridging (DN) 3
## 37 identifying the right antecedent for bridging (DN) and ambiguous relation 1
## 38 identifying the right antecedent for bridging (DN); generic-value 1
## 39 identifying the right antecedent; category 11
## 40 identifying the right antecedent; generic-value 13
## 41 identifying the right antecedent; generic-value; category 3
## 42 identifying the right antecedents 4
## 43 identifying the right antecedents; category 1
## 44 identifying the right segment for deixis 21
## 45 identifying the right segment for deixis + related_object in 1 1
## 46 referring vs. non_referring 40
## 47 referring vs. undefined_reference 5
## 48 single vs. multiple antecedents 15
## 49 single vs. multiple antecedents; category 5
## 50 single vs. multiple antecedents; generic-value 8
## 51 single vs. multiple antecedents; generic-value; category 1
## percentage
## 1 0.1703578
## 2 0.6814310
## 3 1.1925043
## 4 0.1703578
## 5 0.5110733
## 6 3.0664395
## 7 0.1703578
## 8 0.3407155
## 9 0.1703578
## 10 0.3407155
## 11 0.6814310
## 12 0.1703578
## 13 0.1703578
## 14 0.1703578
## 15 6.8143101
## 16 15.8432709
## 17 3.5775128
## 18 8.8586031
## 19 1.7035775
## 20 1.8739353
## 21 11.7546848
## 22 1.1925043
## 23 0.5110733
## 24 0.1703578
## 25 0.1703578
## 26 2.2146508
## 27 0.1703578
## 28 2.2146508
## 29 0.1703578
## 30 0.1703578
## 31 0.6814310
## 32 0.1703578
## 33 0.1703578
## 34 10.7325383
## 35 0.1703578
## 36 0.5110733
## 37 0.1703578
## 38 0.1703578
## 39 1.8739353
## 40 2.2146508
## 41 0.5110733
## 42 0.6814310
## 43 0.1703578
## 44 3.5775128
## 45 0.1703578
## 46 6.8143101
## 47 0.8517888
## 48 2.5553663
## 49 0.8517888
## 50 1.3628620
## 51 0.1703578
## amb_reason_fin
## 1 DN vs. DN+bridging; category
## 2 DN vs. DN+bridging; generic-value
## 3 DN vs. DN+bridging; generic-value; category
## 4 DN vs. DN, no related objects; category
## 5 DO (multiple) vs. DN+bridging; generic-value
## 6 DO (multiple) vs. DN+bridging; generic-value; category
## 7 DO (multiple) vs. DN; category
## 8 DO (single) vs. DN
## 9 DO (single) vs. DN+bridging
## 10 DO (single) vs. DN+bridging; category
## 11 DO (single) vs. DN+bridging; generic-value
## 12 DO (single) vs. DN; category
## 13 DO (single) vs. DN; generic-value
## 14 DO (single) vs. DN; generic-value; category
## 15 DO+bridging (single) vs. DN
## 16 DO+bridging (single) vs. DN+bridging
## 17 DO+bridging (single) vs. DN+bridging; ambiguous relation; category
## 18 antecedent vs. deixis
## 19 antecedent vs. deixis; category
## 20 antecedent+bridging vs. deixis
## 21 deixis vs. DN+bridging
## 22 deixis vs. DN+bridging; category
## 23 identifying the right antecedent
## 24 identifying the right antecedent + bridging in 1
## 25 identifying the right antecedent + bridging in 1; category
## 26 identifying the right antecedent + bridging in 2; generic-value
## 27 identifying the right antecedent for bridging (DN)
## 28 identifying the right antecedent for bridging (DN) and ambiguous relation
## 29 identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 30 identifying the right antecedent; category
## 31 identifying the right antecedent; generic-value
## 32 identifying the right antecedent; generic-value; category
## 33 identifying the right antecedents
## 34 identifying the right antecedents; category
## 35 identifying the right segment for deixis; category
## 36 referring vs. non_referring
## 37 single vs. multiple antecedents
## 38 single vs. multiple antecedents; generic-value
## n percentage dataset
## 1 1 0.3289474 Gutenberg
## 2 2 0.6578947 Gutenberg
## 3 1 0.3289474 Gutenberg
## 4 1 0.3289474 Gutenberg
## 5 2 0.6578947 Gutenberg
## 6 1 0.3289474 Gutenberg
## 7 1 0.3289474 Gutenberg
## 8 37 12.1710526 Gutenberg
## 9 69 22.6973684 Gutenberg
## 10 9 2.9605263 Gutenberg
## 11 5 1.6447368 Gutenberg
## 12 18 5.9210526 Gutenberg
## 13 2 0.6578947 Gutenberg
## 14 1 0.3289474 Gutenberg
## 15 1 0.3289474 Gutenberg
## 16 1 0.3289474 Gutenberg
## 17 1 0.3289474 Gutenberg
## 18 3 0.9868421 Gutenberg
## 19 2 0.6578947 Gutenberg
## 20 1 0.3289474 Gutenberg
## 21 1 0.3289474 Gutenberg
## 22 1 0.3289474 Gutenberg
## 23 32 10.5263158 Gutenberg
## 24 1 0.3289474 Gutenberg
## 25 1 0.3289474 Gutenberg
## 26 1 0.3289474 Gutenberg
## 27 1 0.3289474 Gutenberg
## 28 1 0.3289474 Gutenberg
## 29 1 0.3289474 Gutenberg
## 30 6 1.9736842 Gutenberg
## 31 2 0.6578947 Gutenberg
## 32 2 0.6578947 Gutenberg
## 33 1 0.3289474 Gutenberg
## 34 1 0.3289474 Gutenberg
## 35 1 0.3289474 Gutenberg
## 36 88 28.9473684 Gutenberg
## 37 3 0.9868421 Gutenberg
## 38 1 0.3289474 Gutenberg
## amb_reason_fin n
## 1 DN vs. DN+bridging; category 6
## 2 DN vs. DN, no related objects; category 4
## 3 DN vs. DN, no related objects; generic-value 1
## 4 DN vs. DN, no related objects; generic-value; category 4
## 5 DN vs. DN; ambiguous relation 1
## 6 DO (multiple) vs. DN 1
## 7 DO (multiple) vs. DN+bridging; category 2
## 8 DO (single) vs. DN 38
## 9 DO (single) vs. DN+bridging 97
## 10 DO (single) vs. DN+bridging; category 9
## 11 DO (single) vs. DN+bridging; generic-value 21
## 12 DO (single) vs. DN+bridging; generic-value; category 4
## 13 DO (single) vs. DN; category 34
## 14 DO (single) vs. DN; generic-value 13
## 15 DO (single) vs. DN; generic-value; category 3
## 16 DO+bridging (single) vs. DN+bridging; ambiguous relation 1
## 17 antecedent vs. deixis 5
## 18 deixis vs. DN 1
## 19 deixis vs. DN+bridging 7
## 20 deixis vs. DN+bridging; category 1
## 21 identifying the right antecedent 36
## 22 identifying the right antecedent + bridging in 1 2
## 23 identifying the right antecedent + bridging in 2; generic-value 1
## 24 identifying the right antecedent for bridging (DN) 1
## 25 identifying the right antecedent; category 26
## 26 identifying the right antecedent; generic-value 8
## 27 referring vs. non_referring 154
## 28 single vs. multiple antecedents 6
## 29 single vs. multiple antecedents; category 2
## percentage dataset
## 1 1.2269939 Wikipedia
## 2 0.8179959 Wikipedia
## 3 0.2044990 Wikipedia
## 4 0.8179959 Wikipedia
## 5 0.2044990 Wikipedia
## 6 0.2044990 Wikipedia
## 7 0.4089980 Wikipedia
## 8 7.7709611 Wikipedia
## 9 19.8364008 Wikipedia
## 10 1.8404908 Wikipedia
## 11 4.2944785 Wikipedia
## 12 0.8179959 Wikipedia
## 13 6.9529652 Wikipedia
## 14 2.6584867 Wikipedia
## 15 0.6134969 Wikipedia
## 16 0.2044990 Wikipedia
## 17 1.0224949 Wikipedia
## 18 0.2044990 Wikipedia
## 19 1.4314928 Wikipedia
## 20 0.2044990 Wikipedia
## 21 7.3619632 Wikipedia
## 22 0.4089980 Wikipedia
## 23 0.2044990 Wikipedia
## 24 0.2044990 Wikipedia
## 25 5.3169734 Wikipedia
## 26 1.6359918 Wikipedia
## 27 31.4928425 Wikipedia
## 28 1.2269939 Wikipedia
## 29 0.4089980 Wikipedia
#### Comparison Gutenberg and Wikipedia
The plot compares the distribution of reasons for ambiguity between the two domains of PD. If only one bar is shown for a label, its percentage in the other domain is below the threshold of 0.8%.
## amb_reason_fin
## 1 DN vs. DN+bridging; category
## 2 DN vs. DN+bridging; generic-value
## 3 DN vs. DN+bridging; generic-value; category
## 4 DN vs. DN, no related objects; category
## 5 DN vs. DN, no related objects; generic-value
## 6 DN vs. DN, no related objects; generic-value; category
## 7 DN vs. DN; ambiguous relation
## 8 DO (multiple) vs. DN
## 9 DO (multiple) vs. DN+bridging; category
## 10 DO (multiple) vs. DN+bridging; generic-value
## 11 DO (multiple) vs. DN+bridging; generic-value; category
## 12 DO (multiple) vs. DN; category
## 13 DO (single) vs. DN
## 14 DO (single) vs. DN+bridging
## 15 DO (single) vs. DN+bridging; category
## 16 DO (single) vs. DN+bridging; generic-value
## 17 DO (single) vs. DN+bridging; generic-value; category
## 18 DO (single) vs. DN; category
## 19 DO (single) vs. DN; generic-value
## 20 DO (single) vs. DN; generic-value; category
## 21 DO+bridging (single) vs. DN
## 22 DO+bridging (single) vs. DN+bridging
## 23 DO+bridging (single) vs. DN+bridging; ambiguous relation
## 24 DO+bridging (single) vs. DN+bridging; ambiguous relation; category
## 25 antecedent vs. deixis
## 26 antecedent vs. deixis; category
## 27 antecedent+bridging vs. deixis
## 28 deixis vs. DN
## 29 deixis vs. DN+bridging
## 30 deixis vs. DN+bridging; category
## 31 identifying the right antecedent
## 32 identifying the right antecedent + bridging in 1
## 33 identifying the right antecedent + bridging in 1; category
## 34 identifying the right antecedent + bridging in 2; generic-value
## 35 identifying the right antecedent for bridging (DN)
## 36 identifying the right antecedent for bridging (DN) and ambiguous relation
## 37 identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 38 identifying the right antecedent; category
## 39 identifying the right antecedent; generic-value
## 40 identifying the right antecedent; generic-value; category
## 41 identifying the right antecedents
## 42 identifying the right antecedents; category
## 43 identifying the right segment for deixis; category
## 44 referring vs. non_referring
## 45 single vs. multiple antecedents
## 46 single vs. multiple antecedents; category
## 47 single vs. multiple antecedents; generic-value
## n percentage
## 1 7 0.8827238
## 2 2 0.2522068
## 3 1 0.1261034
## 4 5 0.6305170
## 5 1 0.1261034
## 6 4 0.5044136
## 7 1 0.1261034
## 8 1 0.1261034
## 9 2 0.2522068
## 10 2 0.2522068
## 11 1 0.1261034
## 12 1 0.1261034
## 13 75 9.4577554
## 14 166 20.9331652
## 15 18 2.2698613
## 16 26 3.2786885
## 17 4 0.5044136
## 18 52 6.5573770
## 19 15 1.8915511
## 20 4 0.5044136
## 21 1 0.1261034
## 22 1 0.1261034
## 23 1 0.1261034
## 24 1 0.1261034
## 25 8 1.0088272
## 26 2 0.2522068
## 27 1 0.1261034
## 28 1 0.1261034
## 29 8 1.0088272
## 30 2 0.2522068
## 31 68 8.5750315
## 32 3 0.3783102
## 33 1 0.1261034
## 34 2 0.2522068
## 35 2 0.2522068
## 36 1 0.1261034
## 37 1 0.1261034
## 38 32 4.0353090
## 39 10 1.2610340
## 40 2 0.2522068
## 41 1 0.1261034
## 42 1 0.1261034
## 43 1 0.1261034
## 44 242 30.5170240
## 45 9 1.1349306
## 46 2 0.2522068
## 47 1 0.1261034
# Regular expressions for whole-word matches of personal/possessive pronouns
# and of reflexive pronouns. Both are applied case-insensitively below.
pronouns <- "\\b(i|you|he|she|it|we|they|me|him|her|us|them|my|your|his|its|our|their|mine|yours|hers|ours|theirs)\\b"
reflexives <- "\\b(myself|yourself|himself|herself|itself|ourselves|yourselves|themselves)\\b"
# Keep the rows whose `min_words` contains a personal or possessive pronoun.
df_categorised_pronouns <- df_categorised_fin %>%
  filter(grepl(pronouns, min_words, ignore.case = TRUE))
#df_categorised_reflexive <- df_categorised_fin %>%
#filter(grepl(reflexives, min_words, ignore.case = TRUE))
# Keep the rows whose `min_words` contains neither a pronoun nor a reflexive.
# grepl() accepts a single pattern, so each pattern is tested separately.
# Passing both patterns positionally made `reflexives` the text being searched
# and pushed `min_words` into the `perl` argument, so the column was never
# searched at all.
df_categorised_not_pronouns <- df_categorised_fin %>%
  filter(
    !grepl(pronouns, min_words, ignore.case = TRUE),
    !grepl(reflexives, min_words, ignore.case = TRUE)
  )
#print(df_categorised_pronouns)
print("List of labels and their count in ARRAU + PD for pronouns:")
## [1] "List of labels and their count in ARRAU + PD for pronouns:"
# Frequency table of the final ambiguity labels among the pronoun markables.
table(df_categorised_pronouns[["amb_reason_fin"]])
##
## antecedent vs. deixis
## 4
## antecedent vs. deixis; category
## 2
## DO (multiple) vs. DN; category
## 1
## DO (multiple) vs. DN+bridging
## 1
## DO (multiple) vs. DN+bridging; generic-value
## 2
## DO (single) vs. DN
## 4
## DO (single) vs. DN; category
## 10
## DO (single) vs. DN; generic-value
## 4
## DO (single) vs. DN+bridging
## 3
## DO (single) vs. DN+bridging; category
## 1
## DO (single) vs. DN+bridging; generic-value; category
## 1
## DO+bridging (single) vs. DN; generic-value
## 1
## identifying the right antecedent
## 45
## identifying the right antecedent; category
## 15
## identifying the right antecedent; generic-value
## 7
## identifying the right antecedent; generic-value; category
## 4
## identifying the right antecedents
## 3
## identifying the right antecedents; category
## 1
## identifying the right segment for deixis
## 2
## referring vs. non_referring
## 57
## referring vs. undefined_reference
## 5
## single vs. multiple antecedents
## 18
## single vs. multiple antecedents; category
## 5
## single vs. multiple antecedents; generic-value
## 7
## single vs. multiple antecedents; generic-value; category
## 1
#table(df_categorised_pronouns$gram_fnc)
#table(df_categorised_pronouns$number)
## min_words min_ids id file_name span
## 1 they word_652 markable_294 wsjarrau_2308_phrase_level.csv word_652
## 2 we word_220 markable_126 wsjarrau_2321_phrase_level.csv word_220
## 3 it word_769 markable_148 dia8-2_phrase_level.csv word_769
## 4 they word_2928 markable_14577 mrcnf10j-game_phrase_level.csv word_2928
## mmax_level dataset
## 1 phrase RST
## 2 phrase RST
## 3 phrase Trains_91
## 4 phrase GB
## comment number gender
## 1 ambiguity: boeing + union vs. boeing + strikers at boeing plur undersp-gen
## 2 mp: highly ambiguous plur undersp-gen
## 3 <NA> sing neuter
## 4 <NA> plur undersp-gen
## person gram_fnc ambiguity category category_2 generic generic_2
## 1 per3 subj ambiguous organization organization generic-no generic-no
## 2 per1 subj ambiguous organization person generic-no generic-no
## 3 per3 obj ambiguous concrete concrete generic-no generic-no
## 4 per3 unmarked ambiguous person person generic-no generic-no
## multiple_phrase_antecedents
## 1 markable_272;markable_163
## 2 markable_405;markable_94
## 3 markable_147
## 4 markable_14575;markable_146040147
## multiple_phrase_antecedents_2 non_ref_type non_ref_type_2
## 1 markable_176;markable_272 <NA> <NA>
## 2 markable_124;markable_125 <NA> <NA>
## 3 markable_147;markable_132 <NA> <NA>
## 4 markable_14575;markable_146040147;markable_14475 <NA> <NA>
## object object_2 phrase_antecedent phrase_antecedent_2 ref_type ref_type_2
## 1 <NA> <NA> multiple_phrases multiple_phrases phrase phrase
## 2 acc <NA> multiple_phrases multiple_phrases phrase phrase
## 3 <NA> <NA> multiple_phrases multiple_phrases phrase phrase
## 4 <NA> <NA> multiple_phrases multiple_phrases phrase phrase
## reference related_object related_object_2 related_phrase related_phrase_2
## 1 old no no <NA> <NA>
## 2 old no no <NA> <NA>
## 3 old no no <NA> <NA>
## 4 old no no <NA> <NA>
## related_rel related_rel_2 segment_antecedent segment_antecedent_2
## 1 <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA>
## single_phrase_antecedent single_phrase_antecedent_2 info_status_1
## 1 <NA> <NA> DO
## 2 <NA> <NA> DO
## 3 <NA> <NA> DO
## 4 <NA> <NA> DO
## info_status_2 amb_generic amb_category amb_bridging
## 1 DO no no no
## 2 DO no yes no
## 3 DO no no no
## 4 DO no no no
## amb_reason_pre amb_reason_gen amb_reason_cat
## 1 identifying the right antecedents
## 2 identifying the right antecedents category
## 3 identifying the right antecedents
## 4 identifying the right antecedents
## amb_reason_fin
## 1 identifying the right antecedents
## 2 identifying the right antecedents; category
## 3 identifying the right antecedents
## 4 identifying the right antecedents
## amb_reason_fin n percentage
## 1 DO (single) vs. DN 1 1.052632
## 2 antecedent vs. deixis 4 4.210526
## 3 antecedent vs. deixis; category 2 2.105263
## 4 identifying the right antecedent 16 16.842105
## 5 identifying the right antecedent; category 7 7.368421
## 6 identifying the right antecedent; generic-value 5 5.263158
## 7 identifying the right antecedent; generic-value; category 2 2.105263
## 8 identifying the right antecedents 1 1.052632
## 9 identifying the right segment for deixis 2 2.105263
## 10 referring vs. non_referring 33 34.736842
## 11 referring vs. undefined_reference 1 1.052632
## 12 single vs. multiple antecedents 13 13.684211
## 13 single vs. multiple antecedents; category 2 2.105263
## 14 single vs. multiple antecedents; generic-value 5 5.263158
## 15 single vs. multiple antecedents; generic-value; category 1 1.052632
## Pronoun
## 1 it
## 2 it
## 3 it
## 4 it
## 5 it
## 6 it
## 7 it
## 8 it
## 9 it
## 10 it
## 11 it
## 12 it
## 13 it
## 14 it
## 15 it
## amb_reason_fin n percentage
## 1 DO (multiple) vs. DN+bridging 1 0.9174312
## 2 DO (multiple) vs. DN+bridging; generic-value 2 1.8348624
## 3 DO (multiple) vs. DN; category 1 0.9174312
## 4 DO (single) vs. DN 3 2.7522936
## 5 DO (single) vs. DN+bridging 3 2.7522936
## 6 DO (single) vs. DN+bridging; category 1 0.9174312
## 7 DO (single) vs. DN+bridging; generic-value; category 1 0.9174312
## 8 DO (single) vs. DN; category 10 9.1743119
## 9 DO (single) vs. DN; generic-value 4 3.6697248
## 10 DO+bridging (single) vs. DN; generic-value 1 0.9174312
## 11 identifying the right antecedent 29 26.6055046
## 12 identifying the right antecedent; category 8 7.3394495
## 13 identifying the right antecedent; generic-value 2 1.8348624
## 14 identifying the right antecedent; generic-value; category 2 1.8348624
## 15 identifying the right antecedents 2 1.8348624
## 16 identifying the right antecedents; category 1 0.9174312
## 17 referring vs. non_referring 24 22.0183486
## 18 referring vs. undefined_reference 4 3.6697248
## 19 single vs. multiple antecedents 5 4.5871560
## 20 single vs. multiple antecedents; category 3 2.7522936
## 21 single vs. multiple antecedents; generic-value 2 1.8348624
## Pronoun
## 1 Other
## 2 Other
## 3 Other
## 4 Other
## 5 Other
## 6 Other
## 7 Other
## 8 Other
## 9 Other
## 10 Other
## 11 Other
## 12 Other
## 13 Other
## 14 Other
## 15 Other
## 16 Other
## 17 Other
## 18 Other
## 19 Other
## 20 Other
## 21 Other
# TODO: analyse plural pronouns separately.
# Check how plural pronouns are annotated for "phrase antecedent".
List of unique labels (n = 1):
## amb_reason_fin
## 1 DN vs. DN+bridging
## 2 DN vs. DN; ambiguous relation
## 3 DN vs. DN; generic-value
## 4 DO (multiple) vs. DN; category
## 5 DO (multiple) vs. DN; generic-value
## 6 DO+bridging (single) vs. DN+bridging
## 7 DO+bridging (single) vs. DN+bridging; ambiguous relation
## 8 DO+bridging (single) vs. DN; generic-value
## 9 antecedent vs. deixis+bridging
## 10 antecedent vs. deixis; generic-value
## 11 antecedent+bridging vs. deixis
## 12 deixis vs. DN+bridging; generic-value
## 13 identifying the right antecedent + bridging in 1; category
## 14 identifying the right antecedent + bridging in 2
## 15 identifying the right antecedent for bridging (DN) and ambiguous relation; generic-value
## 16 identifying the right antecedent for bridging (DN); generic-value
## 17 identifying the right segment for deixis + related_object in 1
## 18 identifying the right segment for deixis; category
## 19 single vs. multiple antecedents; generic-value; category
## n
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## 11 1
## 12 1
## 13 1
## 14 1
## 15 1
## 16 1
## 17 1
## 18 1
## 19 1
List of rare labels (n > 1 & n <= 5):
## amb_reason_fin n
## 1 DN vs. DN+bridging; generic-value; category 2
## 2 DN vs. DN, no related objects; generic-value; category 5
## 3 DN vs. DN; ambiguous relation; generic-value 2
## 4 DO (multiple) vs. DN 3
## 5 DO (multiple) vs. DN+bridging 4
## 6 DO (multiple) vs. DN+bridging; category 2
## 7 DO (multiple) vs. DN+bridging; generic-value 3
## 8 DO (multiple) vs. DN+bridging; generic-value; category 2
## 9 DO+bridging (single) vs. DN 4
## 10 DO+bridging (single) vs. DN+bridging; ambiguous relation; category 2
## 11 deixis vs. DN 2
## 12 deixis vs. DN+bridging; category 3
## 13 identifying the right antecedent + bridging in 1 3
## 14 identifying the right antecedent + bridging in 2; generic-value 2
## 15 identifying the right antecedent for bridging (DN) 5
## 16 identifying the right antecedent for bridging (DN) and ambiguous relation 2
## 17 identifying the right antecedent; generic-value; category 5
## 18 identifying the right antecedents 5
## 19 identifying the right antecedents; category 2
## 20 referring vs. undefined_reference 5